from sklearn.feature_extraction.text import CountVectorizer  # 词频向量化

# Training corpus: the vectorizer learns its vocabulary from these texts.
doc = [
    "This's the first document. aa",
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Held-out text to transform with the already-fitted vocabulary.
doc1 = [
    "This's the first document. bb",
]

# Default token_pattern requires at least 2 word characters, so single
# characters and punctuation are dropped during tokenization.
# stop_words: these tokens are removed before the vocabulary is built.
cnt = CountVectorizer(stop_words=["aa", 'bb', 'this', 'is', 'the'])

cnt.fit(doc)  # learn the vocabulary (feature names) from the corpus

# transform() yields a sparse csr_matrix; toarray() densifies it to a
# numpy.ndarray of per-document token counts.
out = cnt.transform(doc).toarray()
print("向量化\n", out, type(out))  # <class 'numpy.ndarray'> after toarray()
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement (available since 1.0).
print("特征名称", cnt.get_feature_names_out())

# out = cnt.transform(doc1).toarray()
# print("对第2组文本向量化\n", out)
# print("特征", cnt.get_feature_names_out())